Concrete Dataset Analysis¶

In [1]:
#%pip install pandas numpy matplotlib seaborn missingno autogluon shapley
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


%matplotlib inline

EDA¶

In [3]:
df = pd.read_csv("dataset.csv", sep=";").reset_index(drop=True)
In [4]:
df.drop(columns=["Unnamed: 0", "id"], inplace=True)
In [5]:
df.head()
Out[5]:
CementComponent BlastFurnaceSlag FlyAshComponent WaterComponent SuperplasticizerComponent CoarseAggregateComponent FineAggregateComponent AgeInDays Strength
0 525.0 0.0 0.0 186.0 0.0 1125.0 613.0 3.0 10.38
1 143.0 169.0 143.0 191.0 8.0 967.0 643.0 28.0 23.52
2 289.0 134.7 0.0 185.7 0.0 1075.0 795.3 28.0 36.96
3 304.0 76.0 0.0 228.0 0.0 932.0 670.0 365.0 39.05
4 157.0 236.0 0.0 192.0 0.0 935.4 781.2 NaN 74.19
In [6]:
df.columns
Out[6]:
Index(['CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent',
       'WaterComponent', 'SuperplasticizerComponent',
       'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays',
       'Strength'],
      dtype='object')
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5407 entries, 0 to 5406
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CementComponent            5194 non-null   float64
 1   BlastFurnaceSlag           5079 non-null   float64
 2   FlyAshComponent            5258 non-null   float64
 3   WaterComponent             5044 non-null   float64
 4   SuperplasticizerComponent  5178 non-null   float64
 5   CoarseAggregateComponent   5095 non-null   float64
 6   FineAggregateComponent     5133 non-null   float64
 7   AgeInDays                  5062 non-null   float64
 8   Strength                   5407 non-null   float64
dtypes: float64(9)
memory usage: 380.3 KB
In [8]:
df.describe()
Out[8]:
CementComponent BlastFurnaceSlag FlyAshComponent WaterComponent SuperplasticizerComponent CoarseAggregateComponent FineAggregateComponent AgeInDays Strength
count 5194.000000 5079.000000 5258.000000 5044.000000 5178.000000 5095.000000 5133.000000 5062.000000 5407.000000
mean 299.073585 58.557531 31.976949 185.067407 4.120614 991.653185 771.449834 51.344528 35.452071
std 105.800354 83.286366 54.667373 18.510484 5.686415 77.165369 78.510465 69.752855 16.401896
min 102.000000 0.000000 0.000000 121.800000 0.000000 801.000000 594.000000 1.000000 2.330000
25% 213.500000 0.000000 0.000000 175.100000 0.000000 938.200000 734.300000 7.000000 23.640000
50% 297.200000 0.000000 0.000000 187.400000 0.000000 978.000000 781.200000 28.000000 33.950000
75% 375.000000 122.600000 79.000000 192.000000 8.100000 1047.000000 821.000000 56.000000 45.850000
max 540.000000 359.400000 200.100000 247.000000 32.200000 1145.000000 992.600000 365.000000 82.600000
In [9]:
# missing values
df.isnull().sum()
Out[9]:
CementComponent              213
BlastFurnaceSlag             328
FlyAshComponent              149
WaterComponent               363
SuperplasticizerComponent    229
CoarseAggregateComponent     312
FineAggregateComponent       274
AgeInDays                    345
Strength                       0
dtype: int64
In [10]:
# percentuale di missing values in una notazione percentuale
df.isnull().sum() / len(df) *100
Out[10]:
CementComponent              3.939338
BlastFurnaceSlag             6.066210
FlyAshComponent              2.755687
WaterComponent               6.713520
SuperplasticizerComponent    4.235251
CoarseAggregateComponent     5.770298
FineAggregateComponent       5.067505
AgeInDays                    6.380618
Strength                     0.000000
dtype: float64

todo trovare cosa fare con i missing values¶

In [11]:
msno.matrix(df)
msno.heatmap(df)
Out[11]:
<Axes: >
No description has been provided for this image
No description has been provided for this image

proviamo con o un simple imputer usando la mediana dato che gli histogram sono skewed o con un knn che dovrebbe essere robusto

In [12]:
import missingno as msno

msno.bar(df)
msno.matrix(df)
msno.heatmap(df)
msno.dendrogram(df)
msno.dendrogram(df, figsize=(10, 10))
msno.dendrogram(df, figsize=(10, 10))
msno.dendrogram(df, figsize=(10, 10))
Out[12]:
<Axes: >
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [13]:
# histogram for each feature
for feature in df.columns:
    sns.histplot(df[feature])
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

try to impute with knn

In [ ]:
from sklearn.impute import KNNImputer

# copy dataframe and run KNN imputation
df_imputed_knn = df.copy()
imputer = KNNImputer(n_neighbors=5, weights='distance')
df_imputed_knn = pd.DataFrame(imputer.fit_transform(df_imputed_knn),
                              columns=df.columns, index=df.index)

# save imputed copy
df_imputed_knn.to_csv("dataset_imputed_knn.csv", index=False)

# quick check
print("Missing before:\n", df.isnull().sum())
print("\nMissing after (KNN imputed):\n", df_imputed_knn.isnull().sum())
In [23]:
# median before and after
print("Median before:\n", df.median())
print("\nMedian after (KNN imputed):\n", df_imputed_knn.median())
Median before:
 CementComponent              297.20
BlastFurnaceSlag               0.00
FlyAshComponent                0.00
WaterComponent               187.40
SuperplasticizerComponent      0.00
CoarseAggregateComponent     978.00
FineAggregateComponent       781.20
AgeInDays                     28.00
Strength                      33.95
dtype: float64

Median after (KNN imputed):
 CementComponent              297.200000
BlastFurnaceSlag               0.000000
FlyAshComponent                0.000000
WaterComponent               188.448466
SuperplasticizerComponent      0.000000
CoarseAggregateComponent     978.000000
FineAggregateComponent       781.200000
AgeInDays                     28.000000
Strength                      33.950000
dtype: float64
In [37]:
# ANALISI OUTLIERS - Comprehensive Outlier Detection
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import IsolationForest

print("="*70)
print("OUTLIER ANALYSIS - Concrete Dataset")
print("="*70)

# Use the imputed dataset
df_analysis = df_imputed_knn.copy()

# 1. STATISTICAL SUMMARY
print("\n" + "="*70)
print("1. STATISTICAL SUMMARY")
print("="*70)
print(df_analysis.describe())

# 2. Z-SCORE METHOD (outliers > 3 standard deviations)
print("\n" + "="*70)
print("2. Z-SCORE METHOD (|z| > 3)")
print("="*70)

z_scores = pd.DataFrame(np.abs(stats.zscore(df_analysis)), columns=df_analysis.columns, index=df_analysis.index)
outliers_zscore = (z_scores > 3)

print("\nOutliers per feature (Z-score > 3):")
for col in df_analysis.columns:
    count = outliers_zscore[col].sum()
    percentage = (count / len(df_analysis)) * 100
    print(f"{col:30s}: {count:4d} outliers ({percentage:5.2f}%)")

print(f"\nTotal outlier data points: {outliers_zscore.sum().sum()}")
rows_with_outliers = outliers_zscore.any(axis=1).sum()
print(f"Rows containing at least one outlier: {rows_with_outliers} ({rows_with_outliers/len(df_analysis)*100:.2f}%)")

# 3. IQR METHOD (Interquartile Range)
print("\n" + "="*70)
print("3. IQR METHOD (beyond 1.5*IQR)")
print("="*70)

outliers_iqr = pd.DataFrame(index=df_analysis.index)
outliers_count_iqr = {}
iqr_bounds = {}

for col in df_analysis.columns:
    Q1 = df_analysis[col].quantile(0.25)
    Q3 = df_analysis[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers_iqr[col] = (df_analysis[col] < lower_bound) | (df_analysis[col] > upper_bound)
    outliers_count_iqr[col] = outliers_iqr[col].sum()
    iqr_bounds[col] = (lower_bound, upper_bound)
    
    print(f"{col:30s}: {outliers_count_iqr[col]:4d} outliers ({outliers_count_iqr[col]/len(df_analysis)*100:5.2f}%) "
          f"[{lower_bound:8.2f}, {upper_bound:8.2f}]")

rows_with_outliers_iqr = outliers_iqr.any(axis=1).sum()
print(f"\nRows containing at least one outlier: {rows_with_outliers_iqr} ({rows_with_outliers_iqr/len(df_analysis)*100:.2f}%)")

# 4. VISUALIZATION - BOXPLOTS
print("\n" + "="*70)
print("4. BOXPLOT VISUALIZATION")
print("="*70)

fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.ravel()

for idx, col in enumerate(df_analysis.columns):
    ax = axes[idx]
    bp = ax.boxplot(df_analysis[col], vert=True, patch_artist=True,
                    boxprops=dict(facecolor='lightblue', alpha=0.7),
                    medianprops=dict(color='red', linewidth=2),
                    whiskerprops=dict(color='blue', linewidth=1.5),
                    capprops=dict(color='blue', linewidth=1.5),
                    flierprops=dict(marker='o', markerfacecolor='red', markersize=6, alpha=0.5))
    
    ax.set_title(f'{col}\n({outliers_count_iqr[col]} outliers)', fontsize=10, fontweight='bold')
    ax.set_ylabel('Value')
    ax.grid(axis='y', alpha=0.3)

plt.suptitle('Outlier Detection - Boxplots for All Features', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

# 5. HISTOGRAM + DISTRIBUTION
print("\n" + "="*70)
print("5. DISTRIBUTION ANALYSIS")
print("="*70)

fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.ravel()

for idx, col in enumerate(df_analysis.columns):
    ax = axes[idx]
    
    # Histogram
    ax.hist(df_analysis[col], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    
    # Get bounds
    lower_bound, upper_bound = iqr_bounds[col]
    
    ax.axvline(lower_bound, color='red', linestyle='--', linewidth=2, label=f'Lower: {lower_bound:.2f}')
    ax.axvline(upper_bound, color='red', linestyle='--', linewidth=2, label=f'Upper: {upper_bound:.2f}')
    ax.axvline(df_analysis[col].median(), color='green', linestyle='-', linewidth=2, label=f'Median: {df_analysis[col].median():.2f}')
    
    ax.set_title(f'{col}', fontsize=10, fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.legend(fontsize=7)
    ax.grid(alpha=0.3)

plt.suptitle('Distribution Analysis with Outlier Boundaries', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

# 6. SCATTER PLOTS - Outliers vs Strength
print("\n" + "="*70)
print("6. FEATURE vs STRENGTH (identifying influential outliers)")
print("="*70)

fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.ravel()

for idx, col in enumerate(df_analysis.columns):
    if col == 'Strength':
        axes[idx].axis('off')
        continue
    
    ax = axes[idx]
    
    # Regular points
    mask_normal = ~outliers_iqr[col]
    ax.scatter(df_analysis.loc[mask_normal, col], 
              df_analysis.loc[mask_normal, 'Strength'],
              alpha=0.5, s=20, c='blue', label='Normal')
    
    # Outlier points
    mask_outlier = outliers_iqr[col]
    if mask_outlier.sum() > 0:
        ax.scatter(df_analysis.loc[mask_outlier, col], 
                  df_analysis.loc[mask_outlier, 'Strength'],
                  alpha=0.7, s=50, c='red', marker='x', label='Outlier')
    
    ax.set_xlabel(col, fontsize=9)
    ax.set_ylabel('Strength', fontsize=9)
    ax.set_title(f'{col} vs Strength', fontsize=10, fontweight='bold')
    ax.legend(fontsize=7)
    ax.grid(alpha=0.3)

plt.suptitle('Features vs Target - Highlighting Outliers', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

# 7. MULTIVARIATE OUTLIERS - Isolation Forest
print("\n" + "="*70)
print("7. MULTIVARIATE OUTLIER DETECTION - Isolation Forest")
print("="*70)

# Isolation Forest
iso_forest = IsolationForest(contamination=0.1, random_state=42)
outliers_iso = iso_forest.fit_predict(df_analysis)
outliers_iso_mask = outliers_iso == -1

print(f"Outliers detected by Isolation Forest: {outliers_iso_mask.sum()} ({outliers_iso_mask.sum()/len(df_analysis)*100:.2f}%)")

# Visualize anomaly scores
anomaly_scores = iso_forest.score_samples(df_analysis)
plt.figure(figsize=(12, 6))
plt.hist(anomaly_scores, bins=50, alpha=0.7, color='purple', edgecolor='black')
plt.axvline(np.percentile(anomaly_scores, 10), color='red', linestyle='--', linewidth=2, label='10th percentile (outlier threshold)')
plt.xlabel('Anomaly Score', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Isolation Forest - Anomaly Score Distribution', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# 8. SUMMARY TABLE
print("\n" + "="*70)
print("8. OUTLIER DETECTION SUMMARY")
print("="*70)

summary_data = {
    'Method': ['Z-Score (|z| > 3)', 'IQR (1.5*IQR)', 'Isolation Forest'],
    'Outlier Rows': [rows_with_outliers, rows_with_outliers_iqr, outliers_iso_mask.sum()],
    'Percentage': [
        f"{rows_with_outliers/len(df_analysis)*100:.2f}%",
        f"{rows_with_outliers_iqr/len(df_analysis)*100:.2f}%",
        f"{outliers_iso_mask.sum()/len(df_analysis)*100:.2f}%"
    ]
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

# 9. CREATE DATASETS WITH/WITHOUT OUTLIERS
print("\n" + "="*70)
print("9. CREATING CLEANED DATASETS")
print("="*70)

# Dataset without IQR outliers
df_no_outliers_iqr = df_analysis[~outliers_iqr.any(axis=1)].copy()
print(f"Original dataset: {len(df_analysis)} rows")
print(f"Dataset without IQR outliers: {len(df_no_outliers_iqr)} rows (removed {len(df_analysis) - len(df_no_outliers_iqr)} rows)")

# Dataset without Isolation Forest outliers
df_no_outliers_iso = df_analysis[~outliers_iso_mask].copy()
print(f"Dataset without Isolation Forest outliers: {len(df_no_outliers_iso)} rows (removed {len(df_analysis) - len(df_no_outliers_iso)} rows)")

# 10. RECOMMENDATION
print("\n" + "="*70)
print("10. RECOMMENDATIONS")
print("="*70)
print("""
Based on the outlier analysis:

1. IQR Method: Most sensitive - detects feature-wise outliers
2. Z-Score: Conservative - only extreme statistical outliers
3. Isolation Forest: Detects multivariate outliers (anomalies in feature combinations)

NEXT STEPS:
✓ Created df_no_outliers_iqr (without IQR outliers)
✓ Created df_no_outliers_iso (without Isolation Forest outliers)

RECOMMENDATIONS:
- Test models on BOTH datasets (with and without outliers)
- Tree-based models (Random Forest, Gradient Boosting) handle outliers well
- Linear models are sensitive to outliers - consider using cleaned data
- Consider capping extreme values instead of removing them
""")
======================================================================
OUTLIER ANALYSIS - Concrete Dataset
======================================================================

======================================================================
1. STATISTICAL SUMMARY
======================================================================
       CementComponent  BlastFurnaceSlag  FlyAshComponent  WaterComponent  \
count      5407.000000       5407.000000      5407.000000     5407.000000   
mean        299.030418         58.251910        31.895506      185.117472   
std         105.107863         82.576886        54.478730       18.346339   
min         102.000000          0.000000         0.000000      121.800000   
25%         213.700000          0.000000         0.000000      175.500000   
50%         297.200000          0.000000         0.000000      188.448466   
75%         375.000000        121.000000        79.000000      192.000000   
max         540.000000        359.400000       200.100000      247.000000   

       SuperplasticizerComponent  CoarseAggregateComponent  \
count                5407.000000               5407.000000   
mean                    4.100082                992.359368   
std                     5.664028                 76.453119   
min                     0.000000                801.000000   
25%                     0.000000                940.000000   
50%                     0.000000                978.000000   
75%                     8.000000               1047.000000   
max                    32.200000               1145.000000   

       FineAggregateComponent    AgeInDays     Strength  
count             5407.000000  5407.000000  5407.000000  
mean               771.266154    52.027313    35.452071  
std                 78.081840    69.179240    16.401896  
min                594.000000     1.000000     2.330000  
25%                734.300000     7.000000    23.640000  
50%                781.200000    28.000000    33.950000  
75%                821.000000    56.000000    45.850000  
max                992.600000   365.000000    82.600000  

======================================================================
2. Z-SCORE METHOD (|z| > 3)
======================================================================

Outliers per feature (Z-score > 3):
CementComponent               :    0 outliers ( 0.00%)
BlastFurnaceSlag              :   20 outliers ( 0.37%)
FlyAshComponent               :    5 outliers ( 0.09%)
WaterComponent                :   34 outliers ( 0.63%)
SuperplasticizerComponent     :   69 outliers ( 1.28%)
CoarseAggregateComponent      :    0 outliers ( 0.00%)
FineAggregateComponent        :    0 outliers ( 0.00%)
AgeInDays                     :  193 outliers ( 3.57%)
Strength                      :    0 outliers ( 0.00%)

Total outlier data points: 321
Rows containing at least one outlier: 307 (5.68%)

======================================================================
3. IQR METHOD (beyond 1.5*IQR)
======================================================================
CementComponent               :    0 outliers ( 0.00%) [  -28.25,   616.95]
BlastFurnaceSlag              :   37 outliers ( 0.68%) [ -181.50,   302.50]
FlyAshComponent               :    5 outliers ( 0.09%) [ -118.50,   197.50]
WaterComponent                :  473 outliers ( 8.75%) [  150.75,   216.75]
SuperplasticizerComponent     :   72 outliers ( 1.33%) [  -12.00,    20.00]
CoarseAggregateComponent      :    0 outliers ( 0.00%) [  779.50,  1207.50]
FineAggregateComponent        :  146 outliers ( 2.70%) [  604.25,   951.05]
AgeInDays                     :  471 outliers ( 8.71%) [  -66.50,   129.50]
Strength                      :   33 outliers ( 0.61%) [   -9.67,    79.16]

Rows containing at least one outlier: 912 (16.87%)

======================================================================
4. BOXPLOT VISUALIZATION
======================================================================
No description has been provided for this image
======================================================================
5. DISTRIBUTION ANALYSIS
======================================================================
No description has been provided for this image
======================================================================
6. FEATURE vs STRENGTH (identifying influential outliers)
======================================================================
No description has been provided for this image
======================================================================
7. MULTIVARIATE OUTLIER DETECTION - Isolation Forest
======================================================================
Outliers detected by Isolation Forest: 541 (10.01%)
No description has been provided for this image
======================================================================
8. OUTLIER DETECTION SUMMARY
======================================================================
           Method  Outlier Rows Percentage
Z-Score (|z| > 3)           307      5.68%
    IQR (1.5*IQR)           912     16.87%
 Isolation Forest           541     10.01%

======================================================================
9. CREATING CLEANED DATASETS
======================================================================
Original dataset: 5407 rows
Dataset without IQR outliers: 4495 rows (removed 912 rows)
Dataset without Isolation Forest outliers: 4866 rows (removed 541 rows)

======================================================================
10. RECOMMENDATIONS
======================================================================

Based on the outlier analysis:

1. IQR Method: Most sensitive - detects feature-wise outliers
2. Z-Score: Conservative - only extreme statistical outliers
3. Isolation Forest: Detects multivariate outliers (anomalies in feature combinations)

NEXT STEPS:
✓ Created df_no_outliers_iqr (without IQR outliers)
✓ Created df_no_outliers_iso (without Isolation Forest outliers)

RECOMMENDATIONS:
- Test models on BOTH datasets (with and without outliers)
- Tree-based models (Random Forest, Gradient Boosting) handle outliers well
- Linear models are sensitive to outliers - consider using cleaned data
- Consider capping extreme values instead of removing them

In [24]:
# check distributions before and after imputation
for feature in df.columns:
    sns.kdeplot(df[feature], label='Original', color='blue')
    sns.kdeplot(df_imputed_knn[feature], label='KNN Imputed', color='orange')
    plt.title(f'Distribution of {feature} before and after KNN Imputation')
    plt.legend()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Feature engineering:¶

In [45]:
# Feature Engineering - Create new features for concrete strength prediction
import numpy as np
import pandas as pd

# Start with the imputed dataset
df_engineered = df_imputed_knn.copy()

print("Creating engineered features...")
print(f"Original features: {df_engineered.shape[1]}")

# 1. RAPPORTI TRA COMPONENTI (cruciali per la chimica del calcestruzzo)
# Water-to-Cement ratio (W/C) - uno dei fattori più importanti per la resistenza
df_engineered['Water_Cement_Ratio'] = df_engineered['WaterComponent'] / (df_engineered['CementComponent'] + 1e-10)

# Total Binder (materiali cementizi totali)
df_engineered['Total_Binder'] = (df_engineered['CementComponent'] + 
                                  df_engineered['BlastFurnaceSlag'] + 
                                  df_engineered['FlyAshComponent'])

# Water-to-Binder ratio (W/B)
df_engineered['Water_Binder_Ratio'] = df_engineered['WaterComponent'] / (df_engineered['Total_Binder'] + 1e-10)

# Fine-to-Coarse Aggregate ratio
df_engineered['Fine_Coarse_Ratio'] = df_engineered['FineAggregateComponent'] / (df_engineered['CoarseAggregateComponent'] + 1e-10)

# Superplasticizer-to-Cement ratio
df_engineered['Plasticizer_Cement_Ratio'] = df_engineered['SuperplasticizerComponent'] / (df_engineered['CementComponent'] + 1e-10)

# Supplementary Cementitious Materials (SCM) to Cement ratio
df_engineered['SCM_Cement_Ratio'] = (df_engineered['BlastFurnaceSlag'] + df_engineered['FlyAshComponent']) / (df_engineered['CementComponent'] + 1e-10)

# 2. SOMME E TOTALI
# Total Aggregate
df_engineered['Total_Aggregate'] = df_engineered['CoarseAggregateComponent'] + df_engineered['FineAggregateComponent']

# Total Weight (peso totale mix)
df_engineered['Total_Weight'] = (df_engineered['CementComponent'] + 
                                  df_engineered['BlastFurnaceSlag'] + 
                                  df_engineered['FlyAshComponent'] + 
                                  df_engineered['WaterComponent'] + 
                                  df_engineered['SuperplasticizerComponent'] + 
                                  df_engineered['CoarseAggregateComponent'] + 
                                  df_engineered['FineAggregateComponent'])

# Paste Volume (rapporto pasta cementizia vs aggregati)
df_engineered['Paste_Volume'] = (df_engineered['Total_Binder'] + df_engineered['WaterComponent']) / (df_engineered['Total_Weight'] + 1e-10)

# 3. PERCENTUALI (composizione del mix)
df_engineered['Cement_Percentage'] = df_engineered['CementComponent'] / (df_engineered['Total_Weight'] + 1e-10) * 100
df_engineered['Water_Percentage'] = df_engineered['WaterComponent'] / (df_engineered['Total_Weight'] + 1e-10) * 100
df_engineered['Aggregate_Percentage'] = df_engineered['Total_Aggregate'] / (df_engineered['Total_Weight'] + 1e-10) * 100
df_engineered['Binder_Percentage'] = df_engineered['Total_Binder'] / (df_engineered['Total_Weight'] + 1e-10) * 100

# 4. FEATURE TEMPORALI (età del calcestruzzo)
# La resistenza del calcestruzzo cresce in modo logaritmico nel tempo
df_engineered['Age_Log'] = np.log1p(df_engineered['AgeInDays'])  # log(1 + x) per gestire 0
df_engineered['Age_Sqrt'] = np.sqrt(df_engineered['AgeInDays'])
df_engineered['Age_Squared'] = df_engineered['AgeInDays'] ** 2
df_engineered['Age_Cubed'] = df_engineered['AgeInDays'] ** 3

# Binning dell'età (categorizzazione)
df_engineered['Age_Young'] = (df_engineered['AgeInDays'] <= 7).astype(int)  # 0-7 giorni
df_engineered['Age_Medium'] = ((df_engineered['AgeInDays'] > 7) & (df_engineered['AgeInDays'] <= 28)).astype(int)  # 7-28 giorni
df_engineered['Age_Old'] = (df_engineered['AgeInDays'] > 28).astype(int)  # 28+ giorni

# 5. FEATURE BINARIE (presenza/assenza di componenti)
df_engineered['Has_FlyAsh'] = (df_engineered['FlyAshComponent'] > 0).astype(int)
df_engineered['Has_BlastFurnace'] = (df_engineered['BlastFurnaceSlag'] > 0).astype(int)
df_engineered['Has_Plasticizer'] = (df_engineered['SuperplasticizerComponent'] > 0).astype(int)

# 6. INTERAZIONI TRA COMPONENTI CHIAVE
# Cement x Age (il cemento guadagna resistenza nel tempo)
df_engineered['Cement_Age_Interaction'] = df_engineered['CementComponent'] * df_engineered['Age_Log']

# Water x Age
df_engineered['Water_Age_Interaction'] = df_engineered['WaterComponent'] * df_engineered['Age_Log']

# Binder x Age
df_engineered['Binder_Age_Interaction'] = df_engineered['Total_Binder'] * df_engineered['Age_Log']

# Water/Cement x Age
df_engineered['WC_Age_Interaction'] = df_engineered['Water_Cement_Ratio'] * df_engineered['Age_Log']

# 7. FEATURE QUADRATICHE (per catturare relazioni non lineari)
df_engineered['Cement_Squared'] = df_engineered['CementComponent'] ** 2
df_engineered['Water_Squared'] = df_engineered['WaterComponent'] ** 2
df_engineered['WC_Ratio_Squared'] = df_engineered['Water_Cement_Ratio'] ** 2

# 8. DENSITÀ E COMPATTEZZA
# Aggregate density
df_engineered['Aggregate_Density'] = df_engineered['Total_Aggregate'] / (df_engineered['Total_Weight'] + 1e-10)

# Binder efficiency (binder per unit of water)
df_engineered['Binder_Efficiency'] = df_engineered['Total_Binder'] / (df_engineered['WaterComponent'] + 1e-10)

# 9. COMBINAZIONI DI MATERIALI SUPPLEMENTARI
# Total SCM (Supplementary Cementitious Materials)
df_engineered['Total_SCM'] = df_engineered['BlastFurnaceSlag'] + df_engineered['FlyAshComponent']
df_engineered['SCM_Percentage'] = df_engineered['Total_SCM'] / (df_engineered['Total_Weight'] + 1e-10) * 100

# Fly Ash to Blast Furnace ratio
df_engineered['FlyAsh_BlastFurnace_Ratio'] = df_engineered['FlyAshComponent'] / (df_engineered['BlastFurnaceSlag'] + 1e-10)

print(f"Engineered features: {df_engineered.shape[1]}")
print(f"New features created: {df_engineered.shape[1] - df_imputed_knn.shape[1]}")
print("\nNew features list:")
new_features = [col for col in df_engineered.columns if col not in df_imputed_knn.columns]
for i, feat in enumerate(new_features, 1):
    print(f"{i}. {feat}")

# Visualize correlation of new features with Strength
print("\n" + "="*70)
print("CORRELATION OF NEW FEATURES WITH STRENGTH (Top 15)")
print("="*70)
new_features_corr = df_engineered[new_features + ['Strength']].corr()['Strength'].drop('Strength').sort_values(ascending=False)
print(new_features_corr.head(15))

# Visualize
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 8))
top_features = new_features_corr.head(20)
colors = ['green' if x > 0 else 'red' for x in top_features.values]
plt.barh(range(len(top_features)), top_features.values, color=colors, alpha=0.7)
plt.yticks(range(len(top_features)), top_features.index)
plt.xlabel('Correlation with Strength', fontsize=12, fontweight='bold')
plt.title('Top 20 New Features - Correlation with Concrete Strength', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
Creating engineered features...
Original features: 9
Engineered features: 44
New features created: 35

New features list:
1. Water_Cement_Ratio
2. Total_Binder
3. Water_Binder_Ratio
4. Fine_Coarse_Ratio
5. Plasticizer_Cement_Ratio
6. SCM_Cement_Ratio
7. Total_Aggregate
8. Total_Weight
9. Paste_Volume
10. Cement_Percentage
11. Water_Percentage
12. Aggregate_Percentage
13. Binder_Percentage
14. Age_Log
15. Age_Sqrt
16. Age_Squared
17. Age_Cubed
18. Age_Young
19. Age_Medium
20. Age_Old
21. Has_FlyAsh
22. Has_BlastFurnace
23. Has_Plasticizer
24. Cement_Age_Interaction
25. Water_Age_Interaction
26. Binder_Age_Interaction
27. WC_Age_Interaction
28. Cement_Squared
29. Water_Squared
30. WC_Ratio_Squared
31. Aggregate_Density
32. Binder_Efficiency
33. Total_SCM
34. SCM_Percentage
35. FlyAsh_BlastFurnace_Ratio

======================================================================
CORRELATION OF NEW FEATURES WITH STRENGTH (Top 15)
======================================================================
Binder_Age_Interaction    0.582406
Age_Log                   0.553642
Water_Age_Interaction     0.474287
Cement_Age_Interaction    0.473223
Age_Sqrt                  0.458170
Age_Old                   0.430362
Binder_Efficiency         0.253757
Binder_Percentage         0.238454
Total_Binder              0.237595
WC_Age_Interaction        0.230582
Paste_Volume              0.214263
Age_Squared               0.180880
Has_Plasticizer           0.174939
Cement_Squared            0.159666
Cement_Percentage         0.153418
Name: Strength, dtype: float64
No description has been provided for this image
In [ ]:
# redo the correlation matrix
#plt.figure(figsize=(10, 8))
#sns.heatmap(df_engineered.corr(), annot=True, fmt=".2f", cmap='coolwarm')
#plt.title("Correlation Matrix after KNN Imputation") 
Out[ ]:
Text(0.5, 1.0, 'Correlation Matrix after KNN Imputation')
In [52]:
import xgboost as xgb
In [53]:
# Create different predictors using sklearn and compare results on MSE and R2 score and use SHAP to explain models
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import shap
import pandas as pd
import matplotlib.pyplot as plt

# Try to import XGBoost
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
    print("✓ XGBoost is available")
except ImportError:
    XGBOOST_AVAILABLE = False
    print("⚠ XGBoost not installed. Install with: pip install xgboost")

# Prepare data
X = df_imputed_knn.drop(columns=["Strength"])
y = df_imputed_knn["Strength"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models to compare
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'SVR': SVR(),
    'KNN': KNeighborsRegressor(n_neighbors=5)
}

# Add XGBoost if available
if XGBOOST_AVAILABLE:
    models['XGBoost'] = xgb.XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
    print("✓ XGBoost added to model comparison")

# Store results
results = []

# Train and evaluate each model
for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    print(f"{'='*50}")
    
    # Train model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    results.append({
        'Model': name,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2
    })
    
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"R^2 Score: {r2:.4f}")

# Create comparison dataframe
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('R2 Score', ascending=False)
print("\n" + "="*70)
print("MODEL COMPARISON - Sorted by R2 Score")
print("="*70)
print(results_df.to_string(index=False))

# Visualize comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# R2 Score comparison
results_df.plot(x='Model', y='R2 Score', kind='barh', ax=axes[0, 0], legend=False, color='skyblue')
axes[0, 0].set_title('R² Score Comparison (Higher is Better)', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('R² Score')
axes[0, 0].grid(axis='x', alpha=0.3)

# RMSE comparison
results_df.plot(x='Model', y='RMSE', kind='barh', ax=axes[0, 1], legend=False, color='salmon')
axes[0, 1].set_title('RMSE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('RMSE')
axes[0, 1].grid(axis='x', alpha=0.3)

# MSE comparison
results_df.plot(x='Model', y='MSE', kind='barh', ax=axes[1, 0], legend=False, color='lightgreen')
axes[1, 0].set_title('MSE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('MSE')
axes[1, 0].grid(axis='x', alpha=0.3)

# MAE comparison
results_df.plot(x='Model', y='MAE', kind='barh', ax=axes[1, 1], legend=False, color='plum')
axes[1, 1].set_title('MAE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('MAE')
axes[1, 1].grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()
✓ XGBoost is available
✓ XGBoost added to model comparison

==================================================
Training Decision Tree...
==================================================
Mean Squared Error: 231.3182
Root Mean Squared Error: 15.2091
Mean Absolute Error: 11.4152
R^2 Score: 0.1499

==================================================
Training Random Forest...
==================================================
Mean Squared Error: 156.2972
Root Mean Squared Error: 12.5019
Mean Absolute Error: 9.7730
R^2 Score: 0.4256

==================================================
Training Gradient Boosting...
==================================================
Mean Squared Error: 140.5207
Root Mean Squared Error: 11.8541
Mean Absolute Error: 9.2800
R^2 Score: 0.4836

==================================================
Training AdaBoost...
==================================================
Mean Squared Error: 163.9486
Root Mean Squared Error: 12.8042
Mean Absolute Error: 10.2720
R^2 Score: 0.3975

==================================================
Training Linear Regression...
==================================================
Mean Squared Error: 209.1183
Root Mean Squared Error: 14.4609
Mean Absolute Error: 11.5011
R^2 Score: 0.2315

==================================================
Training Ridge...
==================================================
Mean Squared Error: 209.1183
Root Mean Squared Error: 14.4609
Mean Absolute Error: 11.5011
R^2 Score: 0.2315

==================================================
Training Lasso...
==================================================
Mean Squared Error: 209.2316
Root Mean Squared Error: 14.4648
Mean Absolute Error: 11.5141
R^2 Score: 0.2311

==================================================
Training ElasticNet...
==================================================
Mean Squared Error: 209.1796
Root Mean Squared Error: 14.4630
Mean Absolute Error: 11.5089
R^2 Score: 0.2313

==================================================
Training SVR...
==================================================
Mean Squared Error: 222.5505
Root Mean Squared Error: 14.9181
Mean Absolute Error: 11.9723
R^2 Score: 0.1822

==================================================
Training KNN...
==================================================
Mean Squared Error: 186.1852
Root Mean Squared Error: 13.6450
Mean Absolute Error: 10.6636
R^2 Score: 0.3158

==================================================
Training XGBoost...
==================================================
Mean Squared Error: 159.3072
Root Mean Squared Error: 12.6217
Mean Absolute Error: 9.8598
R^2 Score: 0.4146

======================================================================
MODEL COMPARISON - Sorted by R2 Score
======================================================================
            Model        MSE      RMSE       MAE  R2 Score
Gradient Boosting 140.520654 11.854141  9.279957  0.483606
    Random Forest 156.297150 12.501886  9.773029  0.425629
          XGBoost 159.307239 12.621697  9.859786  0.414568
         AdaBoost 163.948580 12.804241 10.271958  0.397511
              KNN 186.185244 13.644971 10.663616  0.315795
            Ridge 209.118292 14.460923 11.501069  0.231519
Linear Regression 209.118292 14.460923 11.501068  0.231519
       ElasticNet 209.179574 14.463042 11.508877  0.231294
            Lasso 209.231627 14.464841 11.514120  0.231102
              SVR 222.550541 14.918128 11.972333  0.182157
    Decision Tree 231.318230 15.209150 11.415223  0.149937
No description has been provided for this image

Hyperparameter tuning¶

In [50]:
# HYPERPARAMETER TUNING - Gradient Boosting Optimization
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import time

print("="*70)
print("GRADIENT BOOSTING - HYPERPARAMETER TUNING")
print("="*70)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Features: {X_train.shape[1]}")

# ============================================================================
# STEP 1: Baseline Model (default parameters)
# ============================================================================
print("\n" + "="*70)
print("STEP 1: BASELINE MODEL (default parameters)")
print("="*70)

baseline_model = GradientBoostingRegressor(random_state=42)
baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test)

baseline_mse = mean_squared_error(y_test, y_pred_baseline)
baseline_rmse = np.sqrt(baseline_mse)
baseline_mae = mean_absolute_error(y_test, y_pred_baseline)
baseline_r2 = r2_score(y_test, y_pred_baseline)

print(f"\nBaseline Results:")
print(f"  MSE:  {baseline_mse:.4f}")
print(f"  RMSE: {baseline_rmse:.4f}")
print(f"  MAE:  {baseline_mae:.4f}")
print(f"  R²:   {baseline_r2:.4f}")

# ============================================================================
# STEP 2: Randomized Search (fast exploration)
# ============================================================================
print("\n" + "="*70)
print("STEP 2: RANDOMIZED SEARCH (exploring parameter space)")
print("="*70)

# Define parameter distribution for random search
param_distributions = {
    'n_estimators': [100, 200, 300, 500, 800, 1000],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8, 10],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2', None]
}

print("\nParameter space:")
for param, values in param_distributions.items():
    print(f"  {param:20s}: {values}")

print(f"\nRunning RandomizedSearchCV with 50 iterations and 5-fold CV...")
print("This may take several minutes...\n")

start_time = time.time()

random_search = RandomizedSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_distributions=param_distributions,
    n_iter=50,  # Try 50 random combinations
    cv=5,  # 5-fold cross-validation
    scoring='r2',
    n_jobs=-1,  # Use all CPU cores
    random_state=42,
    verbose=1
)

random_search.fit(X_train, y_train)

random_search_time = time.time() - start_time

print(f"\n✓ RandomizedSearchCV completed in {random_search_time:.2f} seconds")
print(f"\nBest parameters found:")
for param, value in random_search.best_params_.items():
    print(f"  {param:20s}: {value}")

print(f"\nBest CV R² score: {random_search.best_score_:.4f}")

# Evaluate on test set
y_pred_random = random_search.predict(X_test)
random_mse = mean_squared_error(y_test, y_pred_random)
random_rmse = np.sqrt(random_mse)
random_mae = mean_absolute_error(y_test, y_pred_random)
random_r2 = r2_score(y_test, y_pred_random)

print(f"\nTest set results (RandomizedSearch best model):")
print(f"  MSE:  {random_mse:.4f}")
print(f"  RMSE: {random_rmse:.4f}")
print(f"  MAE:  {random_mae:.4f}")
print(f"  R²:   {random_r2:.4f}")

# ============================================================================
# STEP 3: Grid Search (fine-tuning around best parameters)
# ============================================================================
print("\n" + "="*70)
print("STEP 3: GRID SEARCH (fine-tuning best parameters)")
print("="*70)

# Create a narrow grid around the best parameters
best_params = random_search.best_params_

# Define narrow ranges around best values
grid_param = {
    'n_estimators': [max(100, best_params['n_estimators'] - 100), 
                     best_params['n_estimators'], 
                     best_params['n_estimators'] + 100],
    'learning_rate': [max(0.01, best_params['learning_rate'] - 0.02), 
                      best_params['learning_rate'], 
                      min(0.3, best_params['learning_rate'] + 0.02)],
    'max_depth': [max(3, best_params['max_depth'] - 1), 
                  best_params['max_depth'], 
                  best_params['max_depth'] + 1],
    'min_samples_split': [best_params['min_samples_split']],
    'min_samples_leaf': [best_params['min_samples_leaf']],
    'subsample': [best_params['subsample']],
    'max_features': [best_params['max_features']]
}

print("\nFine-tuning grid:")
for param, values in grid_param.items():
    print(f"  {param:20s}: {values}")

print(f"\nRunning GridSearchCV with 5-fold CV...")
print("This may take a few minutes...\n")

start_time = time.time()

grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=grid_param,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

grid_search_time = time.time() - start_time

print(f"\n✓ GridSearchCV completed in {grid_search_time:.2f} seconds")
print(f"\nFinal best parameters:")
for param, value in grid_search.best_params_.items():
    print(f"  {param:20s}: {value}")

print(f"\nBest CV R² score: {grid_search.best_score_:.4f}")

# Evaluate on test set
y_pred_grid = grid_search.predict(X_test)
grid_mse = mean_squared_error(y_test, y_pred_grid)
grid_rmse = np.sqrt(grid_mse)
grid_mae = mean_absolute_error(y_test, y_pred_grid)
grid_r2 = r2_score(y_test, y_pred_grid)

print(f"\nTest set results (GridSearch best model):")
print(f"  MSE:  {grid_mse:.4f}")
print(f"  RMSE: {grid_rmse:.4f}")
print(f"  MAE:  {grid_mae:.4f}")
print(f"  R²:   {grid_r2:.4f}")

# ============================================================================
# STEP 4: Comparison and Visualization
# ============================================================================
print("\n" + "="*70)
print("STEP 4: MODEL COMPARISON")
print("="*70)

comparison_data = {
    'Model': ['Baseline (default)', 'RandomizedSearch', 'GridSearch (final)'],
    'MSE': [baseline_mse, random_mse, grid_mse],
    'RMSE': [baseline_rmse, random_rmse, grid_rmse],
    'MAE': [baseline_mae, random_mae, grid_mae],
    'R² Score': [baseline_r2, random_r2, grid_r2]
}

comparison_df = pd.DataFrame(comparison_data)
print("\n" + comparison_df.to_string(index=False))

# Calculate improvements
r2_improvement = ((grid_r2 - baseline_r2) / baseline_r2) * 100
rmse_improvement = ((baseline_rmse - grid_rmse) / baseline_rmse) * 100

print(f"\n📈 IMPROVEMENTS:")
print(f"  R² improvement:   {r2_improvement:+.2f}%")
print(f"  RMSE improvement: {rmse_improvement:+.2f}%")

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# R² comparison
axes[0, 0].bar(comparison_df['Model'], comparison_df['R² Score'], color=['gray', 'skyblue', 'green'], alpha=0.7)
axes[0, 0].set_title('R² Score Comparison', fontsize=12, fontweight='bold')
axes[0, 0].set_ylabel('R² Score')
axes[0, 0].grid(axis='y', alpha=0.3)
axes[0, 0].set_ylim(0, 1)
for i, v in enumerate(comparison_df['R² Score']):
    axes[0, 0].text(i, v + 0.02, f'{v:.4f}', ha='center', fontweight='bold')

# RMSE comparison
axes[0, 1].bar(comparison_df['Model'], comparison_df['RMSE'], color=['gray', 'skyblue', 'green'], alpha=0.7)
axes[0, 1].set_title('RMSE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[0, 1].set_ylabel('RMSE')
axes[0, 1].grid(axis='y', alpha=0.3)
for i, v in enumerate(comparison_df['RMSE']):
    axes[0, 1].text(i, v + 0.2, f'{v:.2f}', ha='center', fontweight='bold')

# Actual vs Predicted (GridSearch model)
axes[1, 0].scatter(y_test, y_pred_grid, alpha=0.5, s=30, color='green')
axes[1, 0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Perfect prediction')
axes[1, 0].set_xlabel('Actual Strength', fontsize=11)
axes[1, 0].set_ylabel('Predicted Strength', fontsize=11)
axes[1, 0].set_title(f'Actual vs Predicted (Final Model)\nR² = {grid_r2:.4f}', fontsize=12, fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(alpha=0.3)

# Residuals plot
residuals = y_test - y_pred_grid
axes[1, 1].scatter(y_pred_grid, residuals, alpha=0.5, s=30, color='purple')
axes[1, 1].axhline(y=0, color='r', linestyle='--', lw=2)
axes[1, 1].set_xlabel('Predicted Strength', fontsize=11)
axes[1, 1].set_ylabel('Residuals', fontsize=11)
axes[1, 1].set_title('Residual Plot (Final Model)', fontsize=12, fontweight='bold')
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

# ============================================================================
# STEP 5: Feature Importance (Final Model)
# ============================================================================
print("\n" + "="*70)
print("STEP 5: FEATURE IMPORTANCE (Final Model)")
print("="*70)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': grid_search.best_estimator_.feature_importances_
}).sort_values('Importance', ascending=False)

print("\n" + feature_importance.to_string(index=False))

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'], color='teal', alpha=0.7)
plt.xlabel('Importance', fontsize=12, fontweight='bold')
plt.title('Feature Importance - Optimized Gradient Boosting', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# ============================================================================
# STEP 6: Save Best Model
# ============================================================================
print("\n" + "="*70)
print("STEP 6: BEST MODEL SUMMARY")
print("="*70)

best_model = grid_search.best_estimator_

print(f"\n✓ Best Gradient Boosting Model:")
print(f"  - R² Score: {grid_r2:.4f}")
print(f"  - RMSE: {grid_rmse:.4f}")
print(f"  - MAE: {grid_mae:.4f}")
print(f"\n✓ Model saved as: best_model")
print(f"\nTo use this model for predictions:")
print(f"  predictions = best_model.predict(new_data)")

print("\n" + "="*70)
print("HYPERPARAMETER TUNING COMPLETE!")
print("="*70)
======================================================================
GRADIENT BOOSTING - HYPERPARAMETER TUNING
======================================================================

Training set: 4325 samples
Test set: 1082 samples
Features: 43

======================================================================
STEP 1: BASELINE MODEL (default parameters)
======================================================================

Baseline Results:
  MSE:  140.6489
  RMSE: 11.8595
  MAE:  9.2451
  R²:   0.4831

======================================================================
STEP 2: RANDOMIZED SEARCH (exploring parameter space)
======================================================================

Parameter space:
  n_estimators        : [100, 200, 300, 500, 800, 1000]
  learning_rate       : [0.01, 0.05, 0.1, 0.15, 0.2]
  max_depth           : [3, 4, 5, 6, 7, 8, 10]
  min_samples_split   : [2, 5, 10, 20]
  min_samples_leaf    : [1, 2, 4, 8]
  subsample           : [0.7, 0.8, 0.9, 1.0]
  max_features        : ['sqrt', 'log2', None]

Running RandomizedSearchCV with 50 iterations and 5-fold CV...
This may take several minutes...

Fitting 5 folds for each of 50 candidates, totalling 250 fits

✓ RandomizedSearchCV completed in 211.88 seconds

Best parameters found:
  subsample           : 0.7
  n_estimators        : 300
  min_samples_split   : 10
  min_samples_leaf    : 8
  max_features        : log2
  max_depth           : 3
  learning_rate       : 0.05

Best CV R² score: 0.4393

Test set results (RandomizedSearch best model):
  MSE:  140.1357
  RMSE: 11.8379
  MAE:  9.2414
  R²:   0.4850

======================================================================
STEP 3: GRID SEARCH (fine-tuning best parameters)
======================================================================

Fine-tuning grid:
  n_estimators        : [200, 300, 400]
  learning_rate       : [0.030000000000000002, 0.05, 0.07]
  max_depth           : [3, 3, 4]
  min_samples_split   : [10]
  min_samples_leaf    : [8]
  subsample           : [0.7]
  max_features        : ['log2']

Running GridSearchCV with 5-fold CV...
This may take a few minutes...

Fitting 5 folds for each of 27 candidates, totalling 135 fits

✓ GridSearchCV completed in 13.27 seconds

Final best parameters:
  learning_rate       : 0.05
  max_depth           : 3
  max_features        : log2
  min_samples_leaf    : 8
  min_samples_split   : 10
  n_estimators        : 200
  subsample           : 0.7

Best CV R² score: 0.4408

Test set results (GridSearch best model):
  MSE:  139.6006
  RMSE: 11.8153
  MAE:  9.2221
  R²:   0.4870

======================================================================
STEP 4: MODEL COMPARISON
======================================================================

             Model        MSE      RMSE      MAE  R² Score
Baseline (default) 140.648870 11.859548 9.245056  0.483135
  RandomizedSearch 140.135658 11.837891 9.241365  0.485021
GridSearch (final) 139.600631 11.815271 9.222091  0.486987

📈 IMPROVEMENTS:
  R² improvement:   +0.80%
  RMSE improvement: +0.37%
No description has been provided for this image
======================================================================
STEP 5: FEATURE IMPORTANCE (Final Model)
======================================================================

                  Feature  Importance
                Age_Cubed    0.198850
                 Age_Sqrt    0.102132
                AgeInDays    0.091048
    Water_Age_Interaction    0.075758
   Binder_Age_Interaction    0.057597
                Age_Young    0.053072
              Age_Squared    0.042254
   Cement_Age_Interaction    0.030969
       Water_Binder_Ratio    0.026563
                  Age_Log    0.021339
       WC_Age_Interaction    0.020321
        Cement_Percentage    0.018542
        Aggregate_Density    0.016734
           Cement_Squared    0.015845
SuperplasticizerComponent    0.015096
            Water_Squared    0.013405
         Water_Percentage    0.013304
             Total_Binder    0.012382
        Binder_Efficiency    0.012258
     Aggregate_Percentage    0.011553
           WaterComponent    0.011433
               Age_Medium    0.010439
          Has_Plasticizer    0.009954
        Fine_Coarse_Ratio    0.009917
   FineAggregateComponent    0.009440
          CementComponent    0.009254
             Total_Weight    0.008516
           SCM_Percentage    0.007428
 Plasticizer_Cement_Ratio    0.007323
         WC_Ratio_Squared    0.007029
                  Age_Old    0.006670
 CoarseAggregateComponent    0.006518
        Binder_Percentage    0.006285
                Total_SCM    0.006252
FlyAsh_BlastFurnace_Ratio    0.006083
          Total_Aggregate    0.005914
             Paste_Volume    0.005531
       Water_Cement_Ratio    0.004984
         BlastFurnaceSlag    0.003315
         SCM_Cement_Ratio    0.003227
          FlyAshComponent    0.002695
         Has_BlastFurnace    0.001829
               Has_FlyAsh    0.000940
No description has been provided for this image
======================================================================
STEP 6: BEST MODEL SUMMARY
======================================================================

✓ Best Gradient Boosting Model:
  - R² Score: 0.4870
  - RMSE: 11.8153
  - MAE: 9.2221

✓ Model saved as: best_model

To use this model for predictions:
  predictions = best_model.predict(new_data)

======================================================================
HYPERPARAMETER TUNING COMPLETE!
======================================================================
In [54]:
# save best model for streamlit app
import joblib
joblib.dump(best_model, 'best_gradient_boosting_model.pkl')
Out[54]:
['best_gradient_boosting_model.pkl']
In [56]:
# Save the best model and data for Streamlit app
import pickle
import joblib

print("="*70)
print("SAVING BEST MODEL FOR STREAMLIT APP")
print("="*70)

# Retrain on full data for best performance
best_model.fit(X_train, y_train)

print(f"\nBest Model: Gradient Boosting Regressor Optimized")
print(f"R² Score: {results_df.iloc[0]['R2 Score']:.4f}")

# Save model
joblib.dump(best_model, 'best_concrete_model.pkl')
print(f"\n✓ Model saved as: best_concrete_model.pkl")

# Save feature names
feature_names = X_train.columns.tolist()
with open('feature_names.pkl', 'wb') as f:
    pickle.dump(feature_names, f)
print(f"✓ Feature names saved as: feature_names.pkl")

# Save training data for SHAP (sample for speed)
X_train_sample = X_train.sample(n=min(100, len(X_train)), random_state=42)
joblib.dump(X_train_sample, 'X_train_sample.pkl')
print(f"✓ Training sample saved as: X_train_sample.pkl")

# Save statistics for input validation
feature_stats = df_imputed_knn.describe().to_dict()
with open('feature_stats.pkl', 'wb') as f:
    pickle.dump(feature_stats, f)
print(f"✓ Feature statistics saved as: feature_stats.pkl")

print("\n" + "="*70)
print("FILES READY FOR STREAMLIT APP")
print("="*70)
print("\nSaved files:")
print("  1. best_concrete_model.pkl - Trained model")
print("  2. feature_names.pkl - Feature names")
print("  3. X_train_sample.pkl - Training data for SHAP")
print("  4. feature_stats.pkl - Feature statistics")
print("\nNext: Run the Streamlit app with: streamlit run app.")
======================================================================
SAVING BEST MODEL FOR STREAMLIT APP
======================================================================

Best Model: Gradient Boosting Regressor Optimized
R² Score: 0.4836

✓ Model saved as: best_concrete_model.pkl
✓ Feature names saved as: feature_names.pkl
✓ Training sample saved as: X_train_sample.pkl
✓ Feature statistics saved as: feature_stats.pkl

======================================================================
FILES READY FOR STREAMLIT APP
======================================================================

Saved files:
  1. best_concrete_model.pkl - Trained model
  2. feature_names.pkl - Feature names
  3. X_train_sample.pkl - Training data for SHAP
  4. feature_stats.pkl - Feature statistics

Next: Run the Streamlit app with: streamlit run app.
In [58]:
%pip install nbconvert
Collecting nbconvert
  Using cached nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Requirement already satisfied: beautifulsoup4 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (4.14.2)
Requirement already satisfied: bleach!=5.0.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from bleach[css]!=5.0.0->nbconvert) (6.3.0)
Requirement already satisfied: defusedxml in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (0.7.1)
Requirement already satisfied: jinja2>=3.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (3.1.6)
Requirement already satisfied: jupyter-core>=4.7 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (5.9.1)
Requirement already satisfied: jupyterlab-pygments in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (0.3.0)
Requirement already satisfied: markupsafe>=2.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (3.0.3)
Requirement already satisfied: mistune<4,>=2.0.3 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (3.1.4)
Collecting nbclient>=0.5.0 (from nbconvert)
  Using cached nbclient-0.10.2-py3-none-any.whl.metadata (8.3 kB)
Collecting nbformat>=5.7 (from nbconvert)
  Using cached nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Requirement already satisfied: packaging in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (25.0)
Requirement already satisfied: pandocfilters>=1.4.1 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (1.5.1)
Requirement already satisfied: pygments>=2.4.1 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (2.19.2)
Requirement already satisfied: traitlets>=5.1 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (5.14.3)
Requirement already satisfied: webencodings in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert) (0.5.1)
Requirement already satisfied: tinycss2<1.5,>=1.1.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from bleach[css]!=5.0.0->nbconvert) (1.4.0)
Requirement already satisfied: platformdirs>=2.5 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jupyter-core>=4.7->nbconvert) (4.5.0)
Requirement already satisfied: jupyter-client>=6.1.12 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbclient>=0.5.0->nbconvert) (8.6.3)
Requirement already satisfied: fastjsonschema>=2.15 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbformat>=5.7->nbconvert) (2.21.2)
Requirement already satisfied: jsonschema>=2.6 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbformat>=5.7->nbconvert) (4.23.0)
Requirement already satisfied: soupsieve>1.2 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from beautifulsoup4->nbconvert) (2.8)
Requirement already satisfied: typing-extensions>=4.0.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from beautifulsoup4->nbconvert) (4.15.0)
Requirement already satisfied: attrs>=22.2.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (25.4.0)
Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (2025.9.1)
Requirement already satisfied: referencing>=0.28.4 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (0.37.0)
Requirement already satisfied: rpds-py>=0.7.1 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (0.28.0)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (2.9.0.post0)
Requirement already satisfied: pyzmq>=23.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (27.1.0)
Requirement already satisfied: tornado>=6.2 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (6.5.2)
Requirement already satisfied: six>=1.5 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from python-dateutil>=2.8.2->jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (1.17.0)
Using cached nbconvert-7.16.6-py3-none-any.whl (258 kB)
Using cached nbclient-0.10.2-py3-none-any.whl (25 kB)
Using cached nbformat-5.10.4-py3-none-any.whl (78 kB)
Installing collected packages: nbformat, nbclient, nbconvert
Successfully installed nbclient-0.10.2 nbconvert-7.16.6 nbformat-5.10.4
Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip

Shap¶

In [40]:
# SHAP Analysis for top 3 models
print("\n" + "="*70)
print("SHAP ANALYSIS - Explaining Model Predictions")
print("="*70)

# Get top 3 models based on R2 score
top_3_models = results_df.head(3)['Model'].tolist()

# Re-train top models and create SHAP explanations
for model_name in top_3_models:
    print(f"\n{'='*50}")
    print(f"SHAP Analysis for {model_name}")
    print(f"{'='*50}")
    
    # Get the model
    model = models[model_name]
    model.fit(X_train, y_train)
    
    # Create SHAP explainer
    try:
        if model_name in ['Decision Tree', 'Random Forest', 'Gradient Boosting']:
            # Tree-based models can use TreeExplainer
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_test)
        else:
            # For other models, use KernelExplainer (slower but works for any model)
            # Use a smaller background dataset for faster computation
            background = shap.sample(X_train, 100)
            explainer = shap.KernelExplainer(model.predict, background)
            shap_values = explainer.shap_values(X_test[:100])  # Limit test samples for speed
            X_test_sample = X_test[:100]
        
        # Summary plot (bar)
        plt.figure(figsize=(10, 6))
        if model_name in ['Decision Tree', 'Random Forest', 'Gradient Boosting']:
            shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
        else:
            shap.summary_plot(shap_values, X_test_sample, plot_type="bar", show=False)
        plt.title(f'{model_name} - Feature Importance (SHAP)', fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()
        
        # Summary plot (beeswarm)
        plt.figure(figsize=(10, 6))
        if model_name in ['Decision Tree', 'Random Forest', 'Gradient Boosting']:
            shap.summary_plot(shap_values, X_test, show=False)
        else:
            shap.summary_plot(shap_values, X_test_sample, show=False)
        plt.title(f'{model_name} - Feature Impact on Predictions (SHAP)', fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()
        
        # Waterfall plot for first prediction
        plt.figure(figsize=(10, 6))
        if model_name in ['Decision Tree', 'Random Forest', 'Gradient Boosting']:
            shap.waterfall_plot(shap.Explanation(values=shap_values[0], 
                                                  base_values=explainer.expected_value, 
                                                  data=X_test.iloc[0],
                                                  feature_names=X_test.columns.tolist()),
                               show=False)
        else:
            shap.waterfall_plot(shap.Explanation(values=shap_values[0], 
                                                  base_values=explainer.expected_value, 
                                                  data=X_test_sample.iloc[0],
                                                  feature_names=X_test.columns.tolist()),
                               show=False)
        plt.title(f'{model_name} - Single Prediction Explanation (First Test Sample)', fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()
        
    except Exception as e:
        print(f"Error creating SHAP explanation for {model_name}: {str(e)}")
        print("Skipping SHAP analysis for this model...")
        continue

print("\n" + "="*70)
print("ANALYSIS COMPLETE")
print("="*70)
======================================================================
SHAP ANALYSIS - Explaining Model Predictions
======================================================================

==================================================
SHAP Analysis for Gradient Boosting
==================================================
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
==================================================
SHAP Analysis for Random Forest
==================================================
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
==================================================
SHAP Analysis for AdaBoost
==================================================
100%|██████████| 100/100 [00:03<00:00, 28.22it/s]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
======================================================================
ANALYSIS COMPLETE
======================================================================
In [ ]:
import autogluon.features.
autogluon.eda.auto.target_analysis(train_data=df, label="Strength")
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[20], line 2
      1 import autogluon
----> 2 autogluon.eda.auto.target_analysis(train_data=df, label="Strength")

AttributeError: module 'autogluon' has no attribute 'eda'
In [16]:
import autogluon.eda as auto

auto.target_analysis(train_data=df_train, label=target_col)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[16], line 1
----> 1 import autogluon.eda as auto
      3 auto.target_analysis(train_data=df_train, label=target_col)

ModuleNotFoundError: No module named 'autogluon.eda'

Correlazione¶

In [14]:
sns.heatmap(df.corr(), annot=True)

plt.show()
No description has been provided for this image

Feature Engineering¶

Train test split and prediction¶

In [24]:
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

# Load your data
data = TabularDataset(df)

# Split the data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Save splits to separate files (optional)
train_data.to_csv('train.csv')
test_data.to_csv('test.csv')

# Define label column
label = 'Strength'

# Train the model
predictor = TabularPredictor(label=label).fit(train_data)

# Make predictions on test data
predictions = predictor.predict(test_data)

# Evaluate performance
performance = predictor.evaluate(test_data)
print(f'Model performance on test data: {performance}')
No path specified. Models will be saved in: "AutogluonModels\ag-20251109_124701"
Verbosity: 2 (Standard Logging)
=================== System Info ===================
AutoGluon Version:  1.4.0
Python Version:     3.12.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          22
Memory Avail:       11.91 GB / 31.47 GB (37.9%)
Disk Space Avail:   560.29 GB / 952.63 GB (58.8%)
===================================================
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.
	presets='best'    : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'    : Strong accuracy with fast inference speed.
	presets='good'    : Good accuracy with very fast inference speed.
	presets='medium'  : Fast training time, ideal for initial prototyping.
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ...
AutoGluon will save models to "c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\AutogluonModels\ag-20251109_124701"
Train Data Rows:    4325
Train Data Columns: 8
Label Column:       Strength
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (82.6, 2.33, 35.61067, 16.37442)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    12133.86 MB
	Train Data (Original)  Memory Usage: 0.26 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 8 | ['CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) : 8 | ['CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', ...]
	0.0s = Fit runtime
	8 features in original data used to generate 8 features in processed data.
	Train Data (Processed) Memory Usage: 0.26 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.03s ...
AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.11560693641618497, Train Rows: 3825, Val Rows: 500
User-specified model hyperparameters to be fit:
{
	'NN_TORCH': [{}],
	'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0, 'hyperparameter_tune_kwargs': None}}],
	'CAT': [{}],
	'XGB': [{}],
	'FASTAI': [{}],
	'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
	'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
}
Fitting 9 L1 models, fit_strategy="sequential" ...
Fitting model: LightGBMXT ...
	Fitting with cpus=16, gpus=0, mem=0.0/11.9 GB
	-12.8225	 = Validation score   (-root_mean_squared_error)
	10.11s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: LightGBM ...
	Fitting with cpus=16, gpus=0, mem=0.0/11.8 GB
	-12.72	 = Validation score   (-root_mean_squared_error)
	1.15s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: RandomForestMSE ...
	Fitting with cpus=22, gpus=0
	-13.5689	 = Validation score   (-root_mean_squared_error)
	1.36s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: CatBoost ...
	Fitting with cpus=16, gpus=0
	-12.6782	 = Validation score   (-root_mean_squared_error)
	5.17s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	Fitting with cpus=22, gpus=0
	-13.6565	 = Validation score   (-root_mean_squared_error)
	0.56s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	Fitting with cpus=16, gpus=0, mem=0.0/11.3 GB
	-12.7561	 = Validation score   (-root_mean_squared_error)
	4.87s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	Fitting with cpus=16, gpus=0
	-12.8497	 = Validation score   (-root_mean_squared_error)
	0.86s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	Fitting with cpus=16, gpus=0, mem=0.0/11.6 GB
c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Lib\site-packages\sklearn\compose\_column_transformer.py:975: FutureWarning: The parameter `force_int_remainder_cols` is deprecated and will be removed in 1.9. It has no effect. Leave it to its default value to avoid this warning.
  warnings.warn(
	-12.7746	 = Validation score   (-root_mean_squared_error)
	10.16s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...
	Fitting with cpus=16, gpus=0, mem=0.0/11.2 GB
	-13.1556	 = Validation score   (-root_mean_squared_error)
	2.59s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'NeuralNetFastAI': 0.333, 'NeuralNetTorch': 0.292, 'CatBoost': 0.167, 'RandomForestMSE': 0.125, 'LightGBM': 0.083}
	-12.5293	 = Validation score   (-root_mean_squared_error)
	0.03s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 37.72s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 4322.3 rows/s (500 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\AutogluonModels\ag-20251109_124701")
Model performance on test data: {'root_mean_squared_error': np.float64(-12.223556126124398), 'mean_squared_error': -149.4153243685133, 'mean_absolute_error': -9.472742944107477, 'r2': 0.45091903272671363, 'pearsonr': 0.6717917835193142, 'median_absolute_error': -7.4079645538330094}
In [34]:
df['Strength'].describe()
Out[34]:
count    5407.000000
mean       35.452071
std        16.401896
min         2.330000
25%        23.640000
50%        33.950000
75%        45.850000
max        82.600000
Name: Strength, dtype: float64
In [25]:
performance
Out[25]:
{'root_mean_squared_error': np.float64(-12.223556126124398),
 'mean_squared_error': -149.4153243685133,
 'mean_absolute_error': -9.472742944107477,
 'r2': 0.45091903272671363,
 'pearsonr': 0.6717917835193142,
 'median_absolute_error': -7.4079645538330094}
In [26]:
predictor.leaderboard(test_data, silent=True)
Out[26]:
model score_test score_val eval_metric pred_time_test pred_time_val fit_time pred_time_test_marginal pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 CatBoost -12.186064 -12.678166 root_mean_squared_error 0.015701 0.000000 5.166798 0.015701 0.000000 5.166798 1 True 4
1 WeightedEnsemble_L2 -12.223556 -12.529300 root_mean_squared_error 0.252403 0.115678 22.729485 0.007087 0.000000 0.027645 2 True 10
2 LightGBMXT -12.230122 -12.822526 root_mean_squared_error 0.009604 0.009000 10.112404 0.009604 0.009000 10.112404 1 True 1
3 LightGBM -12.352122 -12.720001 root_mean_squared_error 0.002999 0.009243 1.150662 0.002999 0.009243 1.150662 1 True 2
4 NeuralNetTorch -12.385383 -12.774633 root_mean_squared_error 0.016525 0.016654 10.162089 0.016525 0.016654 10.162089 1 True 8
5 XGBoost -12.453361 -12.849735 root_mean_squared_error 0.023535 0.009510 0.863003 0.023535 0.009510 0.863003 1 True 7
6 NeuralNetFastAI -12.608055 -12.756147 root_mean_squared_error 0.029259 0.019957 4.866769 0.029259 0.019957 4.866769 1 True 6
7 LightGBMLarge -12.696348 -13.155580 root_mean_squared_error 0.008000 0.009005 2.585604 0.008000 0.009005 2.585604 1 True 9
8 ExtraTreesMSE -13.092905 -13.656455 root_mean_squared_error 0.186489 0.058352 0.555173 0.186489 0.058352 0.555173 1 True 5
9 RandomForestMSE -13.124074 -13.568943 root_mean_squared_error 0.180832 0.069824 1.355521 0.180832 0.069824 1.355521 1 True 3
In [ ]:
predictor
In [ ]:
 
In [ ]:
 

Shapely plots¶

In [29]:
# install shap in the notebook environment
%pip install -q shap --upgrade# install shap in the notebook environment
Note: you may need to restart the kernel to use updated packages.
Usage:   
  c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Scripts\python.exe -m pip install [options] <requirement specifier> [package-index-options] ...
  c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Scripts\python.exe -m pip install [options] -r <requirements file> [package-index-options] ...
  c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Scripts\python.exe -m pip install [options] [-e] <vcs project url> ...
  c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Scripts\python.exe -m pip install [options] [-e] <local project path> ...
  c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Scripts\python.exe -m pip install [options] <archive url/path> ...

no such option: --upgrade#
In [ ]:
from autogluon.eda import analysis
import os
from IPython.display import IFrame, display

# Run AutoGluon EDA to produce an interactive report for the training data

outdir = "ag_eda_report"
ana = analysis.Analysis(train_data, label=label, problem_type=None, output_dir=outdir)
ana.run()

index_path = os.path.join(outdir, "index.html")
if os.path.exists(index_path):
    display(IFrame(index_path, width=1000, height=600))
else:
    print(f"EDA report created in: {os.path.abspath(outdir)} (no index.html found)")
In [32]:
%pip install shap
Collecting shap
  Using cached shap-0.49.1-cp312-cp312-win_amd64.whl.metadata (25 kB)
Requirement already satisfied: numpy in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (2.1.3)
Requirement already satisfied: scipy in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (1.16.3)
Requirement already satisfied: scikit-learn in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (1.7.2)
Requirement already satisfied: pandas in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (2.3.3)
Requirement already satisfied: tqdm>=4.27.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (4.67.1)
Requirement already satisfied: packaging>20.9 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (25.0)
Requirement already satisfied: slicer==0.0.8 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (0.0.8)
Requirement already satisfied: numba>=0.54 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (0.62.1)
Requirement already satisfied: cloudpickle in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (3.1.2)
Requirement already satisfied: typing-extensions in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (4.15.0)
Requirement already satisfied: llvmlite<0.46,>=0.45.0dev0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from numba>=0.54->shap) (0.45.1)
Requirement already satisfied: colorama in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from tqdm>=4.27.0->shap) (0.4.6)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from pandas->shap) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from pandas->shap) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from pandas->shap) (2025.2)
Requirement already satisfied: joblib>=1.2.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from scikit-learn->shap) (1.5.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from scikit-learn->shap) (3.6.0)
Requirement already satisfied: six>=1.5 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from python-dateutil>=2.8.2->pandas->shap) (1.17.0)
Using cached shap-0.49.1-cp312-cp312-win_amd64.whl (548 kB)
Installing collected packages: shap
Successfully installed shap-0.49.1
Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
In [39]:
import shap

explainer = shap.Explainer("Catboost", train_data.drop(columns=[label]))
shap_values = explainer(test_data.drop(columns=[label]))
shap.summary_plot(shap_values)
shap.summary_plot(shap_values)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[39], line 3
      1 import shap
----> 3 explainer = shap.Explainer("Catboost", train_data.drop(columns=[label]))
      4 shap_values = explainer(test_data.drop(columns=[label]))
      5 shap.summary_plot(shap_values)

File c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Lib\site-packages\shap\explainers\_explainer.py:204, in Explainer.__init__(self, model, masker, link, algorithm, output_names, feature_names, linearize_link, seed, **kwargs)
    200             algorithm = "permutation"
    202     # if we get here then we don't know how to handle what was given to us
    203     else:
--> 204         raise TypeError(
    205             "The passed model is not callable and cannot be analyzed directly with the given masker! Model: "
    206             + str(model)
    207         )
    209 # build the right subclass
    210 if algorithm == "exact":

TypeError: The passed model is not callable and cannot be analyzed directly with the given masker! Model: Catboost
In [38]:
predictor.model_best
Out[38]:
'WeightedEnsemble_L2'